import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import plotly
import plotly.offline as pyoff
import plotly.graph_objs as go
import plotly.express as px
import chart_studio
import chart_studio.plotly as py
import calmap
import datetime
import tensorflow as tf
from datetime import date
from plotly.subplots import make_subplots
from itertools import cycle, product
from statsmodels.tsa.seasonal import STL
from scipy.stats import boxcox
from pmdarima.arima import ARIMA as pmdARIMA
from pmdarima.arima import ADFTest, KPSSTest, auto_arima
from pmdarima.utils import diff_inv
from statsmodels.tsa.stattools import adfuller
from sklearn.model_selection import TimeSeriesSplit
from keras.layers import LSTM, Dense
from keras import Sequential
from keras.backend import clear_session
from keras.callbacks import EarlyStopping
from keras.preprocessing.sequence import TimeseriesGenerator
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from scipy.special import boxcox1p, inv_boxcox1p
import matplotlib.patches as mpatches
SEED = 84796315
PREVISOES = 5
EPOCAS = 30
BATCH_SIZE = 1000
dtOrders = pd.read_csv('../data/olist_orders_dataset.csv', encoding = 'utf8')
# Colunas do tipo data
dateColumns = ['order_purchase_timestamp', 'order_approved_at', 'order_delivered_carrier_date',\
'order_delivered_customer_date', 'order_estimated_delivery_date']
# Dataset de analise temporal
dtOrdersAjustado = dtOrders.copy()
# Convertendo columas de data para date
for col in dateColumns:
dtOrdersAjustado[col] = pd.to_datetime(dtOrdersAjustado[col], format = '%Y-%m-%d %H:%M:%S')
# Dropando valores NA
dtOrdersAjustado = dtOrdersAjustado.dropna()
dtOrdersAjustado.dtypes
order_id object customer_id object order_status object order_purchase_timestamp datetime64[ns] order_approved_at datetime64[ns] order_delivered_carrier_date datetime64[ns] order_delivered_customer_date datetime64[ns] order_estimated_delivery_date datetime64[ns] dtype: object
dtHistory = pd.to_datetime(dtOrdersAjustado['order_purchase_timestamp']).dt.date
start = dtHistory.min()
end = dtHistory.max()
idx = pd.date_range(start, end, normalize=True)
seriesHistory = dtHistory.value_counts(sort=False).sort_index().reindex(idx, fill_value=0)
dtHistory = pd.DataFrame(seriesHistory).reset_index()
Principais outliers identificados:
dtHistory
| index | order_purchase_timestamp | |
|---|---|---|
| 0 | 2016-09-15 | 1 |
| 1 | 2016-09-16 | 0 |
| 2 | 2016-09-17 | 0 |
| 3 | 2016-09-18 | 0 |
| 4 | 2016-09-19 | 0 |
| ... | ... | ... |
| 709 | 2018-08-25 | 69 |
| 710 | 2018-08-26 | 73 |
| 711 | 2018-08-27 | 66 |
| 712 | 2018-08-28 | 39 |
| 713 | 2018-08-29 | 11 |
714 rows × 2 columns
# Plot
# Definição dos dados no plot (Iniciando em Fevereiro de 2017 para não destorcer os dados)
plot_data = [go.Scatter(x = dtHistory['index'],
y = dtHistory['order_purchase_timestamp'])]
# Layout
plot_layout = go.Layout(xaxis = {'title': 'Periodo'},
yaxis = {"title": 'Vendas'},
title = 'Vendas por dia')
# Plot da figura
fig = go.Figure(data = plot_data, layout = plot_layout)
pyoff.iplot(fig)
# Remove outliers
seriesHistory = seriesHistory[datetime.date(2017, 1, 1): datetime.date(2018, 8, 17)]
pred_range = pd.date_range(datetime.date(2018, 8, 17), datetime.date(2018, 10, 17))
dtHistory = pd.DataFrame(seriesHistory).reset_index()
# Plot
# Definição dos dados no plot (Iniciando em Fevereiro de 2017 para não destorcer os dados)
plot_data = [go.Scatter(x = dtHistory['index'],
y = dtHistory['order_purchase_timestamp'])]
# Layout
plot_layout = go.Layout(xaxis = {'title': 'Periodo'},
yaxis = {"title": 'Vendas'},
title = 'Vendas por dia')
# Plot da figura
fig = go.Figure(data = plot_data, layout = plot_layout)
pyoff.iplot(fig)
fig, caxs = calmap.calendarplot(seriesHistory, daylabels='MTWTFSS', fillcolor='grey',cmap='YlGn', fig_kws=dict(figsize=(18, 9)))
fig.suptitle('Histórico de Vendas', fontsize=22)
fig.subplots_adjust(right=0.8)
cbar_ax = fig.add_axes([0.85, 0.15, 0.03, 0.67])
fig.colorbar(caxs[0].get_children()[1], cax=cbar_ax)
plt.show()
def add_stl_plot(fig, res, legend):
axs = fig.get_axes()
# Nome de cada um dos subplots
comps = ["trend", "seasonal", "resid"]
for ax, comp in zip(axs[1:], comps):
series = getattr(res, comp)
if comp == "resid":
ax.plot(series, marker="o", linestyle="none")
else:
ax.plot(series)
ax.legend(legend, frameon=False)
stl = STL(seriesHistory)
stl_res = stl.fit()
fig = stl_res.plot()
fig.set_size_inches((20, 12))
plt.show()
stl = STL(seriesHistory, robust=True)
res_robust = stl.fit()
fig = res_robust.plot()
fig.set_size_inches((20, 12))
res_non_robust = STL(seriesHistory, robust=False).fit()
add_stl_plot(fig, res_non_robust, ["Robusto", "Não Robusto"])
stl = STL(seriesHistory)
res = stl.fit()
deseasonal = res.observed - res.seasonal
bc_history, lmbda = boxcox(seriesHistory+1)
bc_history = pd.Series(bc_history, index=seriesHistory.index)
diff_history = seriesHistory.diff(7).dropna()
xi = seriesHistory.iloc[:7]
fig, axs = plt.subplots(nrows=4, sharex=True, figsize=(14, 8))
seriesHistory.plot(ax=axs[0])
axs[0].set_ylabel('Original')
deseasonal.plot(ax=axs[1])
axs[1].set_ylabel('Deseasonal')
bc_history.plot(ax=axs[2])
axs[2].set_ylabel('Boxcox')
diff_history.plot(ax=axs[3])
axs[3].set_ylabel('Stationary')
fig.align_ylabels()
fig.suptitle('Transformações')
plt.tight_layout()
plt.show()
Os testes abaixo concluiram:
ADF: O teste aceita a hipótese alternativa em que a série é estácionária.
KPSS: O teste aceita a hipótese alternativa em que a série não é estácionária.
ADF teste:
def adf_test(series):
result = adfuller(series, autolag='AIC')
print(f'ADF: {result[0]}')
print(f'steps: {result[1]}')
print(f'p-value: {result[1]}')
for key, value in result[4].items():
print('\nValor Critico:')
print(f' {key}: {value}')
adf = ADFTest(alpha = 0.05)
adf.should_diff(seriesHistory)
(0.01, False)
adf_test(seriesHistory)
ADF: -2.6163082378564737 n_lags: 0.0896733120129114 p-value: 0.0896733120129114 Valor Critico: 1%: -3.441694608475642 Valor Critico: 5%: -2.866544718556839 Valor Critico: 10%: -2.5694353738653684
adf.should_diff(diff_history)
(0.01, False)
adf_test(diff_history)
ADF: -6.80160615209054 n_lags: 2.229745766642107e-09 p-value: 2.229745766642107e-09 Valor Critico: 1%: -3.441834071558759 Valor Critico: 5%: -2.8666061267054626 Valor Critico: 10%: -2.569468095872659
KPSS teste:
kpss = KPSSTest(alpha = 0.05)
kpss.should_diff(seriesHistory)
(0.01, True)
kpss.should_diff(diff_history)
(0.1, False)
Toda a etapa de modelagem será considerada com 5 passos a frente de previsão.
result = pd.DataFrame(columns=['Algorithm', 'MSE', 'RMSE', 'MAE', 'Mean_Real_Value', 'Mean_Predict_Value'])
split_range = TimeSeriesSplit(n_splits = 8, max_train_size = pred_range.shape[0], test_size = pred_range.shape[0])
def record(result, algorithm, mse = -1, rmse = -1, mae = -1, mrv = -1, mpv = -1, show = True):
new = pd.DataFrame(dict(Algorithm = algorithm, MSE = mse, RMSE = rmse, MAE = mae, Mean_Real_Value = mrv,\
Mean_Predict_Value = mpv), index=[0])
result = pd.concat([result, new], ignore_index=True)
if show:
display(result)
return result
def plot(index, pred, mse, title, fig = None, ax = None, ylabel = ''):
global seriesHistory
empty_fig = fig is None
if empty_fig:
fig, ax = plt.subplots(figsize=(13, 6))
else:
ax.set_ylabel(ylabel)
ax.set_title(title)
patch_ = mpatches.Patch(color = 'white', label = f'MSE: {np.mean(mse):.1e}')
L1 = ax.legend(handles = [patch_], loc = 'upper left', fancybox = True, framealpha = 0.7, handlelength = 0)
ax.add_artist(L1)
sns.lineplot(x = seriesHistory.index, y = seriesHistory, label = 'Real', ax = ax)
sns.lineplot(x = index, y = pred, label = 'Previsto', ax = ax)
ax.axvline(x = index[0], color = 'red')
ax.legend(loc = 'upper right')
if empty_fig:
plt.show()
else:
return fig
'''
Correção da função diff_inv original:
https://github.com/alkaline-ml/pmdarima/issues/410
'''
def diff_inv_fix(x_diff, xi, column, steps = 1):
total_len = len(x_diff) + len(xi)
ix = pd.date_range(xi.index[0], periods = total_len)
inv = diff_inv(x_diff, steps, xi = xi) + np.fromiter(cycle(xi), count = total_len, dtype = float)
inv = pd.Series(inv, index = ix, name = column)
return inv
figs, axs = plt.subplots(nrows = 4, sharex = True, figsize = (13,6))
figs.tight_layout()
plt.close()
title = 'Original - Time Series Regression'
data = seriesHistory.copy()
mse = []
rmse = []
mae = []
mrv = []
mpv = []
for train_id, test_id in split_range.split(data):
train, y_test = data.iloc[train_id], data.iloc[test_id]
gen = TimeseriesGenerator(train, train, PREVISOES, batch_size = BATCH_SIZE)
X_train = gen[0][0]
y_train = gen[0][1]
lr = LinearRegression()
lr.fit(X_train, y_train)
X_pred = y_train[-PREVISOES:].reshape(1,-1)
y_pred = np.empty(y_test.shape[0])
for i in range(len(y_pred)):
forecast = lr.predict(X_pred)
X_pred = np.delete(X_pred, 0, 1)
X_pred = np.concatenate((X_pred, forecast.reshape(-1, 1)), 1)
y_pred[i] = forecast
mse.append(mean_squared_error(y_test, y_pred, squared = False))
rmse.append(mean_squared_error(y_test, y_pred, squared = True))
mae.append(mean_absolute_error(y_test, y_pred))
mrv.append(np.mean(y_test))
mpv.append(np.mean(y_pred))
result = record(result, title, np.mean(mse), np.mean(rmse), np.mean(mae), np.mean(mrv), np.mean(mpv), False)
plot(y_test.index, y_pred, mse, title, figs, axs[0], 'Original')
title = 'STL Deseasonal - Time Series Regression'
data = deseasonal.copy()
mse = []
rmse = []
mae = []
mrv = []
mpv = []
for train_id, test_id in split_range.split(data):
train, y_test = data.iloc[train_id], data.iloc[test_id]
gen = TimeseriesGenerator(train, train, PREVISOES, batch_size = BATCH_SIZE)
X_train = gen[0][0]
y_train = gen[0][1]
lr = LinearRegression()
lr.fit(X_train, y_train)
X_pred = y_train[-PREVISOES:].reshape(1, -1)
y_pred = np.empty(y_test.shape[0])
for i in range(len(y_pred)):
forecast = lr.predict(X_pred)
X_pred = np.delete(X_pred, 0, 1)
X_pred = np.concatenate((X_pred, forecast.reshape(-1, 1)), 1)
y_pred[i] = forecast
last_seasonal = res.seasonal.reindex_like(train).tail(stl.period)
y_pred = y_pred + np.fromiter(cycle(last_seasonal), count = y_pred.shape[0], dtype = float)
y_test = y_test + res.seasonal.reindex_like(y_test)
mse.append(mean_squared_error(y_test, y_pred, squared = False))
rmse.append(mean_squared_error(y_test, y_pred, squared = True))
mae.append(mean_absolute_error(y_test, y_pred))
mrv.append(np.mean(y_test))
mpv.append(np.mean(y_pred))
result = record(result, title, np.mean(mse), np.mean(rmse), np.mean(mae), np.mean(mrv), np.mean(mpv), False)
plot(y_test.index, y_pred, mse, title, figs, axs[1], 'Deseasonal')
title = 'STL BoxCox - Time Series Regression'
data = bc_history.copy()
mse = []
rmse = []
mae = []
mrv = []
mpv = []
for train_id, test_id in split_range.split(data):
train, y_test = data.iloc[train_id], data.iloc[test_id]
gen = TimeseriesGenerator(train, train, PREVISOES, batch_size = BATCH_SIZE)
X_train = gen[0][0]
y_train = gen[0][1]
lr = LinearRegression()
lr.fit(X_train, y_train)
X_pred = y_train[-PREVISOES:].reshape(1, -1)
y_pred = np.empty(y_test.shape[0])
for i in range(len(y_pred)):
forecast = lr.predict(X_pred)
X_pred = np.delete(X_pred, 0, 1)
X_pred = np.concatenate((X_pred, forecast.reshape(-1, 1)), 1)
y_pred[i] = forecast
y_pred = inv_boxcox1p(y_pred, lmbda)
y_test = inv_boxcox1p(y_test, lmbda)
mse.append(mean_squared_error(y_test, y_pred, squared = False))
rmse.append(mean_squared_error(y_test, y_pred, squared = True))
mae.append(mean_absolute_error(y_test, y_pred))
mrv.append(np.mean(y_test))
mpv.append(np.mean(y_pred))
result = record(result, title, np.mean(mse), np.mean(rmse), np.mean(mae), np.mean(mrv), np.mean(mpv), False)
plot(y_test.index, y_pred, mse, title, figs, axs[2], 'BoxCox')
title = 'STL Diferença Sazonal - Time Series Regression'
data = diff_history.copy()
mse = []
rmse = []
mae = []
mrv = []
mpv = []
for train_id, test_id in split_range.split(data):
train, y_test = data.iloc[train_id], data.iloc[test_id]
gen = TimeseriesGenerator(train, train, PREVISOES, batch_size = BATCH_SIZE)
X_train = gen[0][0]
y_train = gen[0][1]
lr = LinearRegression()
lr.fit(X_train, y_train)
X_pred = y_train[-PREVISOES:].reshape(1, -1)
y_pred = np.empty(y_test.shape[0])
for i in range(len(y_pred)):
forecast = lr.predict(X_pred)
X_pred = np.delete(X_pred, 0, 1)
X_pred = np.concatenate((X_pred, forecast.reshape(-1, 1)), 1)
y_pred[i] = forecast
xi = seriesHistory.reindex_like(train).tail(PREVISOES)
y_pred = diff_inv_fix(y_pred, xi, 'order_purchase_timestamp', PREVISOES).iloc[PREVISOES:]
y_test = diff_inv_fix(y_test, xi, 'order_purchase_timestamp', PREVISOES).iloc[PREVISOES:]
mse.append(mean_squared_error(y_test, y_pred, squared = False))
rmse.append(mean_squared_error(y_test, y_pred, squared = True))
mae.append(mean_absolute_error(y_test, y_pred))
mrv.append(np.mean(y_test))
mpv.append(np.mean(y_pred))
result = record(result, title, np.mean(mse), np.mean(rmse), np.mean(mae), np.mean(mrv), np.mean(mpv), False)
plot(y_test.index, y_pred, mse, title, figs, axs[3], 'BoxCox')
result
| Algorithm | MSE | RMSE | MAE | Mean_Real_Value | Mean_Predict_Value | |
|---|---|---|---|---|---|---|
| 0 | Original - Time Series Regression | 63.032946 | 5485.688752 | 45.057261 | 179.560484 | 164.066513 |
| 1 | STL Deseasonal - Time Series Regression | 57.842363 | 5070.936469 | 39.592783 | 179.560484 | 166.267257 |
| 2 | STL BoxCox - Time Series Regression | 63.453289 | 5531.781002 | 45.576572 | 179.560484 | 160.984010 |
| 3 | STL Diferença Sazonal - Time Series Regression | 123.412496 | 20765.011179 | 88.116806 | 181.296371 | 175.123977 |